################################
## Script: 07_gpt_annotator_bioweapons.R
## Purpose: Runs the candidate matches produced by the
## cross-encoder through the GPT-4o annotator.
## Data In:
## data/gpt4o_toannotate.rds
## Data Out:
## all files with the form
## data/gpt4o_annotations/annotate_[num].rds
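## each file holds one batch of the input with the GPT-4o labels
## (taken from the response message content) added as the columns
## gpt4o_same_claim_2 and gpt4o_same_subject_2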
## Notes:
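## Must be run as a SLURM array job with array task IDs 1-100
## (the script reads SLURM_ARRAY_TASK_ID to pick its batch).
## Also needs an OpenAI API key in the OPENAI_API_KEY environment variable.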

library(tidyverse)
library(openai)
library(jsonlite)

#########################################
#### Read Files ######################
#########################################


## credentials
#Sys.setenv(
#  OPENAI_API_KEY = ''
#)


## The SLURM array task ID selects which batch this job annotates.
slurm_arrayid <- Sys.getenv("SLURM_ARRAY_TASK_ID")
print(slurm_arrayid)

## Fail fast on a missing or out-of-range task ID. Without this check an
## unset variable yields NA, `out[[NA]]` silently returns NULL, and the
## script only dies later with an unrelated error.
n_batches <- 100L
batch_index <- suppressWarnings(as.integer(slurm_arrayid))
if (is.na(batch_index) || batch_index < 1L || batch_index > n_batches) {
  stop("SLURM_ARRAY_TASK_ID must be an integer between 1 and ", n_batches,
       "; got '", slurm_arrayid, "'", call. = FALSE)
}

## candidate pairs
out <- readRDS("data/gpt4o_toannotate.rds")

## Split into 100 interleaved batches (row i goes to batch ((i - 1) %% 100) + 1)
## and keep only the batch that belongs to this array task.
## The grouping factor is built at full length with all 100 levels so that
## (a) no recycling warning is raised when nrow(out) is not a multiple of 100
## and (b) every batch exists, possibly empty, even when there are fewer
## than 100 rows.
batch_of_row <- factor(rep_len(seq_len(n_batches), nrow(out)),
                       levels = seq_len(n_batches))
out <- split(out, batch_of_row)

out <- out[[batch_index]]


######################################
### Prompts  #########################
######################################

## Instruction text placed at the start of every subject-comparison request.
## The two lists being compared are appended to it further down the script.
## NOTE: the line break inside the string literal is part of the prompt text.
prompt_compare_subject <- "You will be provided with the lists of the people, places, objects, and events discussed in two paragraphs. Based on these
lists, do the two paragraphs discuss the vast majority of the same people, places, objects, and events?"
## Instruction text placed at the start of every claim-comparison request.
## As above, the embedded line break is sent as part of the prompt.
prompt_compare_claim <- "You will be provided with the lists of descriptive, normative, conceptual, and causal claims discussed in two paragraphs. 
Based on these lists, do the two paragraphs discuss the vast majority of the same claims?" 


## Send a single user prompt to GPT-4o and return the reply text.
##
## Args:
##   x: the full prompt, sent as the content of one "user" message.
##
## Returns:
##   A length-1 character vector with the content of the returned message,
##   or NA_character_ when the request fails or the response does not contain
##   exactly one message. Always returning one character value keeps the
##   per-row results aligned with the rows of the batch.
api_fct <- function(x) {
  tryCatch(
    expr = {
      ## temperature = 0 to make the labels as repeatable as possible
      m <- create_chat_completion(model = "gpt-4o-2024-05-13",
                                  messages = list(list(role = "user",
                                                       content = x)),
                                  temperature = 0)
      reply <- m$choices$message.content
      ## A missing or multi-valued reply is treated like a failed request,
      ## so a malformed response cannot shift later results out of position.
      if (length(reply) != 1L) {
        stop("response contained ", length(reply),
             " message(s), expected exactly 1")
      }
      as.character(reply)
    },
    error = function(e) {
      ## Report why the call failed instead of discarding the condition.
      message("Caught an error! ", conditionMessage(e))
      NA_character_
    }
  )
}

## Ask the model one YES/NO question per candidate pair.
##
## Args:
##   prompt:     instruction text placed at the start of every request.
##   ego_list:   vector with the list extracted for paragraph 1 of each pair.
##   alter_list: vector with the list extracted for paragraph 2 of each pair.
##
## Returns:
##   A character vector with one entry per pair holding the model's reply;
##   NA where api_fct() did not return exactly one value.
annotate_pairs <- function(prompt, ego_list, alter_list) {
  stopifnot(length(ego_list) == length(alter_list))
  n_pairs <- length(ego_list)
  ## Pre-filled with NA so that a failed or empty reply can never shorten
  ## the result and misalign it with the rows of the batch.
  labels <- rep(NA_character_, n_pairs)
  ## seq_len() rather than 1:n, so an empty batch sends no requests at all
  ## (1:0 would iterate over c(1, 0)).
  for (j in seq_len(n_pairs)) {
    full_prompt <- paste0(prompt, "\n\n\n # **List for Paragraph 1**: ",
                          ego_list[j], "\n\n\n # **List for Paragraph 2**: ",
                          alter_list[j], "\n\n\n # **Your Label (Respond only 'YES' or 'NO')**: ")
    reply <- api_fct(full_prompt)
    if (length(reply) == 1L) {
      labels[j] <- as.character(reply)
    }
    print(j)  # progress indicator in the job log
  }
  labels
}

## Do the two paragraphs make the same claims?
annotation_claim <- annotate_pairs(prompt_compare_claim,
                                   out$ego_claim, out$alter_claim)
out$gpt4o_same_claim_2 <- annotation_claim


## Do the two paragraphs discuss the same subjects?
annotation_subject <- annotate_pairs(prompt_compare_subject,
                                     out$ego_subject, out$alter_subject)
out$gpt4o_same_subject_2 <- annotation_subject

## Write this batch (with the new label columns) to its own file,
## named after the array task ID.
filename <- paste0("data/gpt4o_annotations/annotate_",
                   slurm_arrayid, ".rds")

## Make sure the output directory exists; otherwise saveRDS() would fail
## only after all API calls for the batch have been made.
dir.create(dirname(filename), recursive = TRUE, showWarnings = FALSE)

saveRDS(out, filename)










